Before running this notebook, follow the setup instructions in the CLTK tutorial.
In [2]:
from cltk.corpus.utils.importer import CorpusImporter
In [3]:
# Create a corpus importer configured for 'greek'; the cells below use it
# to bring the Greek corpora into CLTK.
my_greek_downloader = CorpusImporter('greek')
In [3]:
# Import the 'tlg' corpus from a local directory (second argument).
# NOTE(review): adjust '~/corpora/TLG_E/' to wherever your copy of the TLG
# files lives.
my_greek_downloader.import_corpus('tlg', '~/corpora/TLG_E/')
In [4]:
# Import the 'greek_text_first1kgreek' corpus. No local path is given here,
# so presumably the importer fetches it from its remote source -- confirm
# against the CLTK importer documentation.
my_greek_downloader.import_corpus('greek_text_first1kgreek')
For details on converting TLG texts with TLGU, see the CLTK documentation: http://docs.cltk.org/en/latest/greek.html#converting-tlg-texts-with-tlgu
In [4]:
from cltk.corpus.greek.tlgu import TLGU
# Instantiate the TLGU wrapper and convert the whole imported 'tlg' corpus.
tlgu = TLGU()
tlgu.convert_corpus(corpus='tlg') # writes to: ~/cltk_data/greek/text/tlg/plaintext/
In [6]:
# Convert the First1KGreek TEI XML files to plaintext.
# If you get the following error: 'Install `bs4` and `lxml` to parse these TEI files.'
# then run: `pip install bs4 lxml`
from cltk.corpus.greek.tei import onekgreek_tei_xml_to_text
onekgreek_tei_xml_to_text()
# The plaintext files will now be available at
# `~/cltk_data/greek/text/greek_text_first1kgreek_plaintext/`
In [5]:
!head ~/cltk_data/greek/text/tlg/plaintext/TLG0437.TXT
In [6]:
from cltk.corpus.utils.formatter import tlg_plaintext_cleanup
import os
In [7]:
# Clean every converted TLG plaintext file in place: each file is read, passed
# through CLTK's `tlg_plaintext_cleanup` (punctuation removal on, period
# removal off), lowercased, and written back over the original file.
plaintext_dir = os.path.expanduser('~/cltk_data/greek/text/tlg/plaintext/')
files = os.listdir(plaintext_dir)
for file in files:
    file = os.path.join(plaintext_dir, file)
    # os.listdir() returns sub-directories too; open() would raise on those.
    if not os.path.isfile(file):
        continue
    # The files contain Greek text, so read and write explicitly as UTF-8
    # rather than relying on the platform's default encoding (which is not
    # UTF-8 everywhere and would fail or corrupt the text).
    with open(file, encoding='utf-8') as file_open:
        file_read = file_open.read()
    clean_text = tlg_plaintext_cleanup(file_read, rm_punctuation=True, rm_periods=False)
    clean_text = clean_text.lower()
    # Overwrites the source file: re-run the TLGU conversion to restore the
    # uncleaned text.
    with open(file, 'w', encoding='utf-8') as file_open:
        file_open.write(clean_text)
In [8]:
!head ~/cltk_data/greek/text/tlg/plaintext/TLG0437.TXT